This file contains an example of tuning an XGBoost model with BayesSearchCV.
import pickle
import time
import helpsk as hlp
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import plotly.io as pio
pio.renderers.default='notebook'
with open('../X_train.pkl', 'rb') as handle:
    X_train = pickle.load(handle)
with open('../y_train.pkl', 'rb') as handle:
    y_train = pickle.load(handle)
hlp.pandas.numeric_summary(X_train, return_style=True)
| | # of Non-Nulls | # of Nulls | % Nulls | # of Zeros | % Zeros | Mean | St Dev. | Coef of Var | Skewness | Kurtosis | Min | 10% | 25% | 50% | 75% | 90% | Max |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| duration | 760 | 40 | 5.0% | 0 | 0.0% | 21.0 | 11.7 | 0.6 | 1.0 | 0.6 | 4.0 | 9.0 | 12.0 | 18.0 | 24.0 | 36.0 | 60.0 |
| credit_amount | 800 | 0 | 0.0% | 38 | 5.0% | 3,203.9 | 2,932.3 | 0.9 | 1.9 | 3.9 | 0.0 | 753.9 | 1,300.8 | 2,236.5 | 3,951.5 | 7,394.6 | 18,424.0 |
| installment_commitment | 800 | 0 | 0.0% | 0 | 0.0% | 3.0 | 1.1 | 0.4 | -0.5 | -1.2 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| residence_since | 800 | 0 | 0.0% | 0 | 0.0% | 2.9 | 1.1 | 0.4 | -0.3 | -1.4 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| age | 800 | 0 | 0.0% | 0 | 0.0% | 35.6 | 11.4 | 0.3 | 1.0 | 0.7 | 19.0 | 23.0 | 27.0 | 33.0 | 42.0 | 52.0 | 75.0 |
| existing_credits | 800 | 0 | 0.0% | 0 | 0.0% | 1.4 | 0.6 | 0.4 | 1.3 | 1.6 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 | 4.0 |
| num_dependents | 800 | 0 | 0.0% | 0 | 0.0% | 1.1 | 0.3 | 0.3 | 2.0 | 2.1 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 |
hlp.pandas.non_numeric_summary(X_train, return_style=True)
| | # of Non-Nulls | # of Nulls | % Nulls | Most Freq. Value | # of Unique | % Unique |
|---|---|---|---|---|---|---|
| checking_status | 763 | 37 | 4.6% | no checking | 4 | 0.5% |
| credit_history | 800 | 0 | 0.0% | existing paid | 5 | 0.6% |
| purpose | 800 | 0 | 0.0% | radio/tv | 10 | 1.2% |
| savings_status | 800 | 0 | 0.0% | <100 | 5 | 0.6% |
| employment | 800 | 0 | 0.0% | 1<=X<4 | 5 | 0.6% |
| personal_status | 800 | 0 | 0.0% | male single | 4 | 0.5% |
| other_parties | 800 | 0 | 0.0% | none | 3 | 0.4% |
| property_magnitude | 800 | 0 | 0.0% | car | 4 | 0.5% |
| other_payment_plans | 800 | 0 | 0.0% | none | 3 | 0.4% |
| housing | 800 | 0 | 0.0% | own | 3 | 0.4% |
| job | 800 | 0 | 0.0% | skilled | 4 | 0.5% |
| own_telephone | 800 | 0 | 0.0% | none | 2 | 0.2% |
| foreign_worker | 800 | 0 | 0.0% | yes | 2 | 0.2% |
y_train[0:10]
array([1, 1, 0, 1, 0, 1, 0, 1, 1, 0])
np.unique(y_train, return_counts=True)
(array([0, 1]), array([559, 241]))
np.unique(y_train, return_counts=True)[1] / np.sum(np.unique(y_train, return_counts=True)[1])
array([0.69875, 0.30125])
from sklearn.preprocessing import OrdinalEncoder
OrdinalEncoder().fit_transform(X_train[['purpose', 'savings_status']])
array([[0., 2.],
[2., 2.],
[9., 1.],
...,
[9., 3.],
[6., 4.],
[6., 2.]])
numeric_columns = hlp.pandas.get_numeric_columns(X_train)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(X_train)
print(numeric_columns)
print(non_numeric_columns)
['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']
numeric_pipeline = Pipeline([
    # tune whether or not we want to impute or simply remove rows with missing values
    ('imputer', hlp.sklearn_pipeline.TransformerChooser()),
    # this is here so that we can select between MinMaxScaler and StandardScaler;
    # if this pipeline is run in a context outside of tuning, no transformation will take place
    ('scaler', hlp.sklearn_pipeline.TransformerChooser()),
])
non_numeric_pipeline = Pipeline([
    ('encoder', hlp.sklearn_pipeline.TransformerChooser()),
])
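hlp.sklearn_pipeline.TransformerChooser is used above as a placeholder step whose transformer is chosen during tuning. Its implementation isn't shown here; as a rough mental model (an assumption about its behavior, not helpsk's actual code), it acts like a pass-through transformer that delegates to whatever transformer the search assigns:

from sklearn.base import BaseEstimator, TransformerMixin

class TransformerChooserSketch(BaseEstimator, TransformerMixin):
    # hypothetical stand-in for hlp.sklearn_pipeline.TransformerChooser:
    # wraps an optional transformer; when `transformer` is None the data passes
    # through unchanged, so the search can tune which (if any) transformer is
    # used at this pipeline step
    def __init__(self, transformer=None):
        self.transformer = transformer

    def fit(self, X, y=None):
        if self.transformer is not None:
            self.transformer.fit(X, y)
        return self

    def transform(self, X):
        if self.transformer is None:
            return X
        return self.transformer.transform(X)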
from sklearn.compose import ColumnTransformer
transformations_pipeline = ColumnTransformer([
('numeric', numeric_pipeline, numeric_columns),
('non_numeric', non_numeric_pipeline, non_numeric_columns)
])
Note: on Apple Silicon (ARM), importing xgboost can fail with "XGBoostError: XGBoost Library (libxgboost.dylib) could not be loaded". See https://github.com/dmlc/xgboost/issues/6909; one workaround is to reinstall from source:
pip install --upgrade --force-reinstall xgboost --no-binary xgboost -v
from xgboost import XGBClassifier
#from sklearn.decomposition import KernelPCA
xgb_model = XGBClassifier(random_state=42,
eval_metric='logloss',
use_label_encoder=False)
full_pipeline = Pipeline([
('prep', transformations_pipeline),
# ('pca', KernelPCA()),
# ('pca', hlp.sklearn_pipeline.TransformerChooser()),
('model', xgb_model)
])
# Show the levels of pipelines/transformers/model
full_pipeline.named_steps
{'prep': ColumnTransformer(transformers=[('numeric',
Pipeline(steps=[('imputer',
TransformerChooser()),
('scaler',
TransformerChooser())]),
['duration', 'credit_amount',
'installment_commitment', 'residence_since',
'age', 'existing_credits',
'num_dependents']),
('non_numeric',
Pipeline(steps=[('encoder',
TransformerChooser())]),
['checking_status', 'credit_history',
'purpose', 'savings_status', 'employment',
'personal_status', 'other_parties',
'property_magnitude', 'other_payment_plans',
'housing', 'job', 'own_telephone',
'foreign_worker'])]),
'model': XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
colsample_bynode=None, colsample_bytree=None,
enable_categorical=False, eval_metric='logloss', gamma=None,
gpu_id=None, importance_type=None, interaction_constraints=None,
learning_rate=None, max_delta_step=None, max_depth=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, reg_alpha=None, reg_lambda=None,
scale_pos_weight=None, subsample=None, tree_method=None,
use_label_encoder=False, validate_parameters=None,
verbosity=None)}
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score # , roc_auc_score
from sklearn.metrics import SCORERS
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
# https://stackoverflow.com/questions/60615281/different-result-roc-auc-score-and-plot-roc-curve
scores = {
# https://github.com/scikit-learn/scikit-learn/blob/2beed5584/sklearn/metrics/_scorer.py#L537
'ROC/AUC': SCORERS['roc_auc'],
'F1': make_scorer(f1_score, greater_is_better=True),
'Pos. Pred. Val': make_scorer(precision_score, greater_is_better=True),
'True Pos. Rate': make_scorer(recall_score, greater_is_better=True),
}
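The links above explain why SCORERS['roc_auc'] is used rather than wrapping roc_auc_score in make_scorer directly: by default, make_scorer would score the hard 0/1 predictions instead of the model's continuous output, which understates ROC/AUC. A minimal equivalent sketch (using needs_threshold, which is how the built-in 'roc_auc' scorer is defined in scikit-learn versions of this era):

from sklearn.metrics import make_scorer, roc_auc_score

# score the model's continuous output (decision_function or predict_proba)
# rather than the thresholded class labels
roc_auc_scorer = make_scorer(roc_auc_score, needs_threshold=True)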
num_folds = 5
num_repeats = 2
# pip install scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import RepeatedKFold
Reasonable starting ranges, per https://towardsdatascience.com/xgboost-fine-tune-and-optimize-your-model-23d996fab663:

- max_depth: 3–10
- n_estimators: 100 (lots of observations) to 1000 (few observations)
- learning_rate: 0.01–0.3
- colsample_bytree: 0.5–1
- subsample: 0.6–1

Then, you can focus on optimizing max_depth and n_estimators. After that, experiment with learning_rate: increasing it speeds up the model, as long as performance does not drop. If training becomes faster without losing performance, you can increase the number of estimators to try to improve performance.

The keys in the search space below use scikit-learn's step__substep__parameter naming convention to address parameters nested inside the pipeline.
search_space = {
'prep__numeric__imputer__transformer': Categorical([SimpleImputer(strategy='mean')]),
'prep__numeric__scaler__transformer': Categorical([
None,
MinMaxScaler(),
StandardScaler()
]),
'prep__non_numeric__encoder__transformer': Categorical([
# None,
OneHotEncoder(),
hlp.sklearn_pipeline.CustomOrdinalEncoder()
]),
# 'pca__transformer': Categorical([
# None,
# KernelPCA(n_components=5, kernel='rbf'),
# KernelPCA(n_components=5, kernel='sigmoid'),
# KernelPCA(n_components=5, kernel='linear'),
# ]),
# 'pca__n_components': Integer(3, X_train.shape[1]),
# 'pca__gamma': Real(0.03, 0.05),
# 'pca__kernel': Categorical(['rbf', 'sigmoid']),
'model__max_depth': Integer(3, 10),
'model__n_estimators': Integer(50, 2000),
'model__learning_rate': Real(0.01, 0.3),
'model__colsample_bytree': Real(0.01, 1),
'model__subsample': Real(0.1, 1),
}
bayes_search = BayesSearchCV(
estimator=full_pipeline,
search_spaces=search_space,
n_iter=50,
cv=RepeatedKFold(n_splits=num_folds, n_repeats=num_repeats),
scoring='roc_auc',
#return_train_score=True,
n_jobs=-1,
verbose=1,
random_state=42,
)
start_time = time.time()
bayes_search.fit(X_train, y_train)
elapsed_time = time.time() - start_time
del search_space
Fitting 10 folds for each of 1 candidates, totalling 10 fits
(the line above repeats 50 times, once per BayesSearchCV iteration; 5 folds x 2 repeats = 10 fits per candidate)
print(f"Elapsed time to run BayesSearchCV: {elapsed_time:.3f} seconds; {elapsed_time / 60:.1f} minutes")
Elapsed time to run BayesSearchCV: 153.129 seconds; 2.6 minutes
print(bayes_search.cv_results_)
{'mean_fit_time': array([0.89816132, 3.0764225 , 0.82913582, 4.30177827, 1.92396953,
1.45830369, 1.4233129 , 1.88282964, 1.68454292, 0.18679664,
1.01264901, 0.14172795, 0.45804882, 2.67004008, 0.13521459,
1.36825943, 0.46995008, 0.65330856, 0.19113827, 0.47605207,
2.28726478, 0.19134943, 0.0461545 , 0.1843302 , 0.88133376,
0.03310969, 0.43091331, 0.23171203, 0.43778236, 0.34669089,
0.08219891, 0.48769341, 0.05991201, 0.16163681, 2.77276096,
1.46445477, 0.29412501, 1.91430132, 1.87365606, 0.02733583,
0.38780365, 0.02459159, 0.02633662, 1.27306516, 0.40463541,
2.03928359, 0.56104724, 0.35019045, 2.38269603, 0.28807607]), 'std_fit_time': array([0.12769682, 0.47720331, 0.15277264, 0.35735077, 0.35592962,
0.34551636, 0.22785558, 0.32718645, 0.28931432, 0.01723838,
0.20799813, 0.06342528, 0.06556805, 0.4216427 , 0.03405872,
0.28196644, 0.06232091, 0.09331994, 0.0306277 , 0.09281915,
0.40074423, 0.02865588, 0.01673284, 0.03151839, 0.14708791,
0.01246951, 0.06361268, 0.02759997, 0.0729468 , 0.05012863,
0.00950529, 0.06391736, 0.01506224, 0.01157997, 0.38842702,
0.29932999, 0.05987531, 0.31562562, 0.39461849, 0.00603598,
0.06480694, 0.00561323, 0.00845572, 0.19382381, 0.07326875,
0.30355297, 0.09645451, 0.05036612, 0.43254191, 0.04968032]), 'mean_score_time': array([0.02877641, 0.04885268, 0.01741676, 0.05621626, 0.0550607 ,
0.06915152, 0.04726262, 0.04520524, 0.03155162, 0.01327584,
0.02386544, 0.06439781, 0.02831004, 0.101421 , 0.01564195,
0.03435686, 0.06001642, 0.01800482, 0.02854214, 0.02685826,
0.07926798, 0.00927567, 0.02098334, 0.02165785, 0.02238741,
0.0114017 , 0.01514642, 0.03219085, 0.02864709, 0.01525116,
0.0088814 , 0.01791248, 0.01071515, 0.00820458, 0.01437714,
0.02708569, 0.01435773, 0.02059252, 0.02176239, 0.00819659,
0.02069819, 0.00785918, 0.00737405, 0.02391355, 0.01872656,
0.02436416, 0.01369231, 0.01971207, 0.02507064, 0.01405826]), 'std_score_time': array([0.01159442, 0.02512903, 0.00727215, 0.01408033, 0.01962229,
0.02864968, 0.02185857, 0.0248278 , 0.02412403, 0.00438455,
0.00633402, 0.04966949, 0.0071095 , 0.03719042, 0.00661942,
0.0189179 , 0.02734709, 0.00953984, 0.01259339, 0.00919246,
0.03136404, 0.00449351, 0.00848865, 0.00810228, 0.00609113,
0.0037944 , 0.00452725, 0.01589615, 0.01242696, 0.00832405,
0.00296361, 0.00368812, 0.00399344, 0.00218362, 0.00491723,
0.00873267, 0.00572117, 0.00643798, 0.00582292, 0.0037005 ,
0.01098078, 0.00273955, 0.0035422 , 0.00637941, 0.01314853,
0.00720598, 0.0055979 , 0.01104573, 0.01221282, 0.00515879]), 'param_model__colsample_bytree': masked_array(data=[0.4160029192647807, 0.8390144719977516,
0.4503841871781403, 0.8142720284737898,
0.8015579071911014, 0.7366877378057127,
0.6209085649172932, 0.5479690370134094,
0.955923206446829, 0.013594004182195795,
0.08421115929901542, 0.01, 0.01, 0.6065413967740271,
0.048335768627225786, 0.5001203222226797, 0.01, 0.01,
0.7207725616169308, 0.01, 0.8774037457099605, 0.01,
0.01, 0.01, 1.0, 0.01, 0.3303413938503384, 0.01, 1.0,
0.013461851806932441, 1.0, 1.0, 0.01, 0.01, 1.0, 1.0,
0.01, 1.0, 1.0, 0.01, 0.01, 0.01, 0.01,
0.22941721714014698, 0.01, 0.8958307524735796, 0.01,
0.01, 1.0, 0.01],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_model__learning_rate': masked_array(data=[0.22104046552142426, 0.26616143044045004,
0.27642953128411935, 0.059842752805002605,
0.13702846406786778, 0.2824172239048038,
0.23485566986008594, 0.27680267566682176,
0.21318021894019612, 0.24717070570248795, 0.01, 0.01,
0.01, 0.01, 0.01136065688268026, 0.03201179945791919,
0.09126402168435382, 0.01, 0.02351939030602585,
0.017022513219900365, 0.01, 0.01, 0.01,
0.020278132151339716, 0.020499061204489202,
0.04191540390742086, 0.017818599233791366,
0.02271579268239867, 0.07175271006980719,
0.17887475205606373, 0.1649386600945301,
0.17938529269056155, 0.141191195815678,
0.13414656781324166, 0.1083323273712242,
0.1468935346113331, 0.14760455116616997, 0.01, 0.01,
0.15122959954685214, 0.1598701464790812,
0.14344994031810823, 0.16451368551598844, 0.01,
0.06377819322440227, 0.01, 0.01, 0.01,
0.03854828611306992, 0.01],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_model__max_depth': masked_array(data=[10, 5, 4, 7, 7, 4, 6, 6, 9, 8, 10, 10, 10, 10, 9, 10,
10, 9, 9, 10, 10, 10, 5, 9, 8, 9, 9, 10, 3, 3, 3, 5, 3,
3, 3, 3, 6, 10, 5, 3, 3, 7, 3, 7, 4, 8, 3, 5, 3, 5],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_model__n_estimators': masked_array(data=[666, 1905, 895, 1615, 1446, 417, 1755, 1683, 861, 546,
1823, 50, 2000, 2000, 285, 1907, 1785, 1922, 95, 1764,
984, 727, 50, 701, 1726, 50, 712, 566, 650, 1268, 103,
1188, 135, 619, 2000, 1895, 778, 2000, 1628, 50, 1551,
50, 50, 1750, 1485, 1075, 1925, 1114, 1649, 912],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_model__subsample': masked_array(data=[0.7031331534420412, 0.8777151239184556,
0.2691596677306137, 0.5707470952489453,
0.913466865256018, 0.8204529083110704,
0.633357707775011, 0.3916539840559363,
0.44295828831774353, 0.6207979026055463,
0.22376364645022406, 1.0, 0.1, 0.43119463810964986,
0.11718352911818523, 0.10021159551100563,
0.15940165019958488, 0.38002580008793974,
0.6568798506341175, 0.1, 1.0, 0.1895505870635225, 0.1,
0.1, 0.1, 0.9026052157094342, 0.1, 1.0,
0.34033406734215377, 0.8768181168587077, 0.1, 0.1, 1.0,
0.3277821389372305, 1.0, 1.0, 1.0, 0.1, 0.1,
0.716920752438039, 0.8129377909193679,
0.739527682162506, 1.0, 0.2218705814625066,
0.26189027287809796, 0.2610914788988883,
0.3043110505682685, 0.20654326436152715,
0.3844693067305833, 0.3345201472294166],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_prep__non_numeric__encoder__transformer': masked_array(data=[OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
OneHotEncoder(), CustomOrdinalEncoder(),
OneHotEncoder(), CustomOrdinalEncoder(),
OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
OneHotEncoder(), CustomOrdinalEncoder(),
CustomOrdinalEncoder(), CustomOrdinalEncoder(),
OneHotEncoder(), OneHotEncoder(),
CustomOrdinalEncoder(), OneHotEncoder(),
CustomOrdinalEncoder(), CustomOrdinalEncoder(),
CustomOrdinalEncoder(), OneHotEncoder(),
CustomOrdinalEncoder(), CustomOrdinalEncoder(),
CustomOrdinalEncoder(), OneHotEncoder(),
OneHotEncoder(), CustomOrdinalEncoder(),
CustomOrdinalEncoder(), OneHotEncoder(),
OneHotEncoder(), CustomOrdinalEncoder(),
OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
CustomOrdinalEncoder(), OneHotEncoder(),
OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
CustomOrdinalEncoder(), OneHotEncoder(),
OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
OneHotEncoder(), OneHotEncoder()],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_prep__numeric__imputer__transformer': masked_array(data=[SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer()],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_prep__numeric__scaler__transformer': masked_array(data=[MinMaxScaler(), MinMaxScaler(), StandardScaler(),
StandardScaler(), MinMaxScaler(), MinMaxScaler(),
MinMaxScaler(), None, MinMaxScaler(), None,
MinMaxScaler(), None, StandardScaler(),
StandardScaler(), MinMaxScaler(), MinMaxScaler(), None,
StandardScaler(), MinMaxScaler(), None, MinMaxScaler(),
MinMaxScaler(), None, StandardScaler(), MinMaxScaler(),
StandardScaler(), StandardScaler(), MinMaxScaler(),
MinMaxScaler(), MinMaxScaler(), StandardScaler(), None,
StandardScaler(), StandardScaler(), None, None,
StandardScaler(), None, None, None, StandardScaler(),
StandardScaler(), StandardScaler(), StandardScaler(),
StandardScaler(), MinMaxScaler(), StandardScaler(),
None, StandardScaler(), StandardScaler()],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'params': [OrderedDict([('model__colsample_bytree', 0.4160029192647807), ('model__learning_rate', 0.22104046552142426), ('model__max_depth', 10), ('model__n_estimators', 666), ('model__subsample', 0.7031331534420412), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.8390144719977516), ('model__learning_rate', 0.26616143044045004), ('model__max_depth', 5), ('model__n_estimators', 1905), ('model__subsample', 0.8777151239184556), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.4503841871781403), ('model__learning_rate', 0.27642953128411935), ('model__max_depth', 4), ('model__n_estimators', 895), ('model__subsample', 0.2691596677306137), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.8142720284737898), ('model__learning_rate', 0.059842752805002605), ('model__max_depth', 7), ('model__n_estimators', 1615), ('model__subsample', 0.5707470952489453), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.8015579071911014), ('model__learning_rate', 0.13702846406786778), ('model__max_depth', 7), ('model__n_estimators', 1446), ('model__subsample', 0.913466865256018), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.7366877378057127), ('model__learning_rate', 0.2824172239048038), ('model__max_depth', 4), ('model__n_estimators', 417), ('model__subsample', 0.8204529083110704), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.6209085649172932), ('model__learning_rate', 0.23485566986008594), ('model__max_depth', 6), ('model__n_estimators', 1755), ('model__subsample', 0.633357707775011), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.5479690370134094), ('model__learning_rate', 0.27680267566682176), ('model__max_depth', 6), ('model__n_estimators', 1683), ('model__subsample', 0.3916539840559363), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__colsample_bytree', 0.955923206446829), ('model__learning_rate', 0.21318021894019612), ('model__max_depth', 9), ('model__n_estimators', 861), ('model__subsample', 0.44295828831774353), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 
0.013594004182195795), ('model__learning_rate', 0.24717070570248795), ('model__max_depth', 8), ('model__n_estimators', 546), ('model__subsample', 0.6207979026055463), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__colsample_bytree', 0.08421115929901542), ('model__learning_rate', 0.01), ('model__max_depth', 10), ('model__n_estimators', 1823), ('model__subsample', 0.22376364645022406), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.01), ('model__max_depth', 10), ('model__n_estimators', 50), ('model__subsample', 1.0), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.01), ('model__max_depth', 10), ('model__n_estimators', 2000), ('model__subsample', 0.1), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.6065413967740271), ('model__learning_rate', 0.01), ('model__max_depth', 10), ('model__n_estimators', 2000), ('model__subsample', 0.43119463810964986), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.048335768627225786), ('model__learning_rate', 0.01136065688268026), ('model__max_depth', 9), ('model__n_estimators', 285), ('model__subsample', 0.11718352911818523), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.5001203222226797), ('model__learning_rate', 0.03201179945791919), ('model__max_depth', 10), ('model__n_estimators', 1907), ('model__subsample', 0.10021159551100563), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.09126402168435382), ('model__max_depth', 10), ('model__n_estimators', 1785), ('model__subsample', 0.15940165019958488), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.01), ('model__max_depth', 9), ('model__n_estimators', 1922), ('model__subsample', 0.38002580008793974), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.7207725616169308), ('model__learning_rate', 0.02351939030602585), ('model__max_depth', 9), ('model__n_estimators', 95), ('model__subsample', 0.6568798506341175), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), 
('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.017022513219900365), ('model__max_depth', 10), ('model__n_estimators', 1764), ('model__subsample', 0.1), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__colsample_bytree', 0.8774037457099605), ('model__learning_rate', 0.01), ('model__max_depth', 10), ('model__n_estimators', 984), ('model__subsample', 1.0), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.01), ('model__max_depth', 10), ('model__n_estimators', 727), ('model__subsample', 0.1895505870635225), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.01), ('model__max_depth', 5), ('model__n_estimators', 50), ('model__subsample', 0.1), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.020278132151339716), ('model__max_depth', 9), ('model__n_estimators', 701), ('model__subsample', 0.1), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 1.0), ('model__learning_rate', 0.020499061204489202), ('model__max_depth', 8), ('model__n_estimators', 1726), ('model__subsample', 0.1), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.04191540390742086), ('model__max_depth', 9), ('model__n_estimators', 50), ('model__subsample', 0.9026052157094342), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.3303413938503384), ('model__learning_rate', 0.017818599233791366), ('model__max_depth', 9), ('model__n_estimators', 712), ('model__subsample', 0.1), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.02271579268239867), ('model__max_depth', 10), ('model__n_estimators', 566), ('model__subsample', 1.0), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 1.0), ('model__learning_rate', 0.07175271006980719), ('model__max_depth', 3), ('model__n_estimators', 650), ('model__subsample', 0.34033406734215377), 
('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.013461851806932441), ('model__learning_rate', 0.17887475205606373), ('model__max_depth', 3), ('model__n_estimators', 1268), ('model__subsample', 0.8768181168587077), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 1.0), ('model__learning_rate', 0.1649386600945301), ('model__max_depth', 3), ('model__n_estimators', 103), ('model__subsample', 0.1), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 1.0), ('model__learning_rate', 0.17938529269056155), ('model__max_depth', 5), ('model__n_estimators', 1188), ('model__subsample', 0.1), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.141191195815678), ('model__max_depth', 3), ('model__n_estimators', 135), ('model__subsample', 1.0), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.13414656781324166), ('model__max_depth', 3), ('model__n_estimators', 619), ('model__subsample', 0.3277821389372305), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 1.0), ('model__learning_rate', 0.1083323273712242), ('model__max_depth', 3), ('model__n_estimators', 2000), ('model__subsample', 1.0), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__colsample_bytree', 1.0), ('model__learning_rate', 0.1468935346113331), ('model__max_depth', 3), ('model__n_estimators', 1895), ('model__subsample', 1.0), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.14760455116616997), ('model__max_depth', 6), ('model__n_estimators', 778), ('model__subsample', 1.0), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 1.0), ('model__learning_rate', 0.01), ('model__max_depth', 10), ('model__n_estimators', 2000), ('model__subsample', 0.1), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__colsample_bytree', 1.0), ('model__learning_rate', 0.01), ('model__max_depth', 5), ('model__n_estimators', 1628), ('model__subsample', 0.1), 
('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.15122959954685214), ('model__max_depth', 3), ('model__n_estimators', 50), ('model__subsample', 0.716920752438039), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.1598701464790812), ('model__max_depth', 3), ('model__n_estimators', 1551), ('model__subsample', 0.8129377909193679), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.14344994031810823), ('model__max_depth', 7), ('model__n_estimators', 50), ('model__subsample', 0.739527682162506), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.16451368551598844), ('model__max_depth', 3), ('model__n_estimators', 50), ('model__subsample', 1.0), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.22941721714014698), ('model__learning_rate', 0.01), ('model__max_depth', 7), ('model__n_estimators', 1750), ('model__subsample', 0.2218705814625066), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.06377819322440227), ('model__max_depth', 4), ('model__n_estimators', 1485), ('model__subsample', 0.26189027287809796), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.8958307524735796), ('model__learning_rate', 0.01), ('model__max_depth', 8), ('model__n_estimators', 1075), ('model__subsample', 0.2610914788988883), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.01), ('model__max_depth', 3), ('model__n_estimators', 1925), ('model__subsample', 0.3043110505682685), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.01), ('model__max_depth', 5), ('model__n_estimators', 1114), ('model__subsample', 0.20654326436152715), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__colsample_bytree', 1.0), ('model__learning_rate', 0.03854828611306992), 
('model__max_depth', 3), ('model__n_estimators', 1649), ('model__subsample', 0.3844693067305833), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.01), ('model__max_depth', 5), ('model__n_estimators', 912), ('model__subsample', 0.3345201472294166), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())])], 'split0_test_score': array([0.66446038, 0.66216945, 0.71383648, 0.72802569, 0.74572386,
0.71035409, 0.67368232, 0.65839387, 0.73458333, 0.75503672,
0.7267958 , 0.73334639, 0.78737605, 0.72156586, 0.77518358,
0.77845192, 0.68160422, 0.75804933, 0.71703544, 0.80487351,
0.74347904, 0.75949094, 0.69549851, 0.774807 , 0.76618304,
0.72465174, 0.72272727, 0.75854701, 0.7104232 , 0.77883185,
0.71883903, 0.71645212, 0.72693452, 0.72947724, 0.72809829,
0.76265209, 0.67875744, 0.69624256, 0.76836364, 0.62399932,
0.67496082, 0.64567006, 0.75753348, 0.80654545, 0.71848291,
0.82055525, 0.77613694, 0.79347826, 0.72578348, 0.72702991]), 'split1_test_score': array([0.73455378, 0.63337054, 0.67421984, 0.66475948, 0.72966184,
0.70630878, 0.72125 , 0.67563636, 0.72005208, 0.69760748,
0.71768648, 0.73173224, 0.78365385, 0.72961777, 0.73256039,
0.66898889, 0.76638177, 0.73653245, 0.78755172, 0.75254545,
0.75892857, 0.78549696, 0.69735863, 0.75730833, 0.77916915,
0.64599483, 0.75890313, 0.67595556, 0.72456357, 0.72942949,
0.73507814, 0.65045767, 0.75858124, 0.76940247, 0.69332598,
0.68518519, 0.71707589, 0.75729469, 0.79483501, 0.75625539,
0.70554315, 0.67989583, 0.8057971 , 0.75539258, 0.74336283,
0.68875918, 0.78917071, 0.7280506 , 0.71116551, 0.80182866]), 'split2_test_score': array([0.74844662, 0.73931624, 0.72112736, 0.75743707, 0.6630689 ,
0.70926044, 0.65194805, 0.7113604 , 0.69194146, 0.69177827,
0.75070608, 0.7389226 , 0.75569801, 0.77198268, 0.74061012,
0.72331544, 0.67864734, 0.77688746, 0.72114479, 0.72596154,
0.69083072, 0.72844273, 0.69073593, 0.7628702 , 0.63212808,
0.6769943 , 0.774 , 0.70423341, 0.6832 , 0.71850876,
0.78084207, 0.66550725, 0.7657385 , 0.7250744 , 0.65355135,
0.65522356, 0.75625 , 0.75030859, 0.68908213, 0.60864583,
0.70196727, 0.61153236, 0.81621244, 0.76214734, 0.78318841,
0.79069767, 0.72740316, 0.75819088, 0.67375 , 0.79186904]), 'split3_test_score': array([0.72145455, 0.75768693, 0.70120525, 0.79341991, 0.67391304,
0.71274907, 0.67485119, 0.72801804, 0.75307359, 0.69809387,
0.78451772, 0.74332571, 0.72311668, 0.75585149, 0.75400966,
0.73779558, 0.65874396, 0.74273608, 0.76595745, 0.77221228,
0.71064935, 0.69846588, 0.7578599 , 0.70325739, 0.73796509,
0.60202691, 0.71456967, 0.79508197, 0.77873718, 0.73468599,
0.7678394 , 0.62543676, 0.80345912, 0.74782609, 0.68022886,
0.67150298, 0.77362351, 0.72638964, 0.73620173, 0.77128149,
0.72829976, 0.71833085, 0.72547543, 0.7765 , 0.73093579,
0.78290909, 0.75602133, 0.7435164 , 0.74004193, 0.79806763]), 'split4_test_score': array([0.72715573, 0.7374552 , 0.72204752, 0.67183463, 0.74561404,
0.71281465, 0.69026549, 0.69848901, 0.672657 , 0.76310273,
0.79633867, 0.70241546, 0.69010417, 0.71260684, 0.78287462,
0.70003434, 0.72827635, 0.71235834, 0.79304348, 0.69122257,
0.70264361, 0.77757901, 0.71018651, 0.74891775, 0.69050343,
0.73084677, 0.77898551, 0.77079108, 0.67410412, 0.72012579,
0.73763636, 0.61498708, 0.76884332, 0.80474352, 0.768221 ,
0.69111969, 0.7296883 , 0.77018025, 0.73112536, 0.61744166,
0.73708341, 0.70731465, 0.70909433, 0.75510204, 0.76 ,
0.71688034, 0.76429596, 0.75455047, 0.7297247 , 0.71036364]), 'split5_test_score': array([0.68155911, 0.76116771, 0.74032738, 0.74067633, 0.76814516,
0.7373913 , 0.70750135, 0.73832866, 0.7406994 , 0.74661681,
0.78389498, 0.72637681, 0.77527273, 0.76963636, 0.74563636,
0.67563636, 0.7726926 , 0.72607422, 0.7730559 , 0.72558515,
0.77688787, 0.71557045, 0.74904653, 0.82303585, 0.71439614,
0.68290909, 0.81636364, 0.64187328, 0.67595556, 0.71277429,
0.70363636, 0.65654545, 0.71755118, 0.72778257, 0.6268599 ,
0.67072464, 0.76177041, 0.75366695, 0.72039164, 0.72916667,
0.67569409, 0.65532373, 0.77182155, 0.75428356, 0.77367965,
0.78871091, 0.75654302, 0.77076803, 0.7265898 , 0.69317315]), 'split6_test_score': array([0.77461663, 0.76594027, 0.6100546 , 0.7082209 , 0.71869704,
0.67153159, 0.72637681, 0.71207729, 0.74995169, 0.72735116,
0.77678571, 0.74418605, 0.76485788, 0.7574657 , 0.79708333,
0.73430473, 0.65672727, 0.7897498 , 0.73922671, 0.70748299,
0.72909091, 0.73468661, 0.70571332, 0.77194357, 0.75520833,
0.68396577, 0.69633397, 0.75454545, 0.7302726 , 0.77863578,
0.73849879, 0.692904 , 0.774807 , 0.70144928, 0.71147979,
0.76426285, 0.72368421, 0.79015335, 0.72435897, 0.6851489 ,
0.75344732, 0.74706113, 0.75855072, 0.76299858, 0.85351158,
0.84187804, 0.85257014, 0.7628702 , 0.74396135, 0.76544622]), 'split7_test_score': array([0.76935065, 0.70269794, 0.70236364, 0.69403126, 0.6741453 ,
0.70251885, 0.74927273, 0.67448513, 0.70927273, 0.71231477,
0.69426638, 0.73015607, 0.85242878, 0.67987351, 0.80887491,
0.80356198, 0.68490909, 0.78330721, 0.7651158 , 0.77966796,
0.68805804, 0.75646552, 0.68306636, 0.74299517, 0.64576803,
0.60297483, 0.7350686 , 0.77446658, 0.75558036, 0.72410407,
0.74542334, 0.66007835, 0.77113906, 0.7641369 , 0.74051339,
0.71781994, 0.72311087, 0.77942429, 0.76833333, 0.62841273,
0.71949405, 0.6894686 , 0.77900349, 0.78127184, 0.67749023,
0.70649895, 0.76460569, 0.70562771, 0.67636811, 0.73417239]), 'split8_test_score': array([0.70479911, 0.64828093, 0.67788196, 0.73232596, 0.70571332,
0.69191016, 0.74498112, 0.71965368, 0.69719807, 0.75372024,
0.72453704, 0.72108844, 0.68744589, 0.76601831, 0.72127273,
0.74237351, 0.68162226, 0.66979167, 0.70005649, 0.80985789,
0.71961758, 0.78051106, 0.69940476, 0.72292103, 0.77076353,
0.73376507, 0.77366667, 0.77352335, 0.7312128 , 0.75173922,
0.73054545, 0.67522321, 0.79827191, 0.79783638, 0.738 ,
0.73436308, 0.69181034, 0.74763636, 0.76145455, 0.673226 ,
0.70682002, 0.61479786, 0.74275078, 0.73466505, 0.76841693,
0.7025 , 0.71817995, 0.78318841, 0.74515648, 0.79163636]), 'split9_test_score': array([0.765016 , 0.71310175, 0.70438712, 0.77367727, 0.73511905,
0.66982887, 0.66684473, 0.73604901, 0.72244754, 0.72292143,
0.83227053, 0.78299517, 0.75694444, 0.75099567, 0.71191866,
0.73659674, 0.75301313, 0.75319302, 0.74995031, 0.77687627,
0.73679315, 0.75247902, 0.70940171, 0.76816239, 0.72567008,
0.66285278, 0.73818182, 0.74637249, 0.71384773, 0.76271186,
0.71063305, 0.68638393, 0.76234326, 0.75747863, 0.77136752,
0.6995671 , 0.79389594, 0.71261356, 0.7366453 , 0.71774892,
0.67285024, 0.62541063, 0.75518182, 0.78756674, 0.77188676,
0.71036364, 0.74682846, 0.76554857, 0.71895833, 0.8076177 ]), 'mean_test_score': array([0.72914125, 0.7121187 , 0.69674512, 0.72644085, 0.71598015,
0.70246678, 0.70069738, 0.70524915, 0.71918769, 0.72685435,
0.75877994, 0.73545449, 0.75768985, 0.74156142, 0.75700244,
0.73010595, 0.7062618 , 0.74486796, 0.75121381, 0.75462856,
0.72569788, 0.74891882, 0.70982722, 0.75762187, 0.72177549,
0.67469821, 0.75088003, 0.73953902, 0.71778971, 0.74115471,
0.7368972 , 0.66439758, 0.76476691, 0.75252075, 0.71116461,
0.70524211, 0.73496669, 0.74839102, 0.74307916, 0.68113269,
0.70761601, 0.66948057, 0.76214212, 0.76764732, 0.75809551,
0.75497531, 0.76517554, 0.75657895, 0.71914997, 0.76212047]), 'std_test_score': array([0.03532285, 0.0464993 , 0.03451745, 0.04010837, 0.03386697,
0.01920759, 0.03236961, 0.02630063, 0.02509012, 0.02522005,
0.04066983, 0.01958846, 0.04646856, 0.02836228, 0.03100094,
0.03956659, 0.04224098, 0.03446856, 0.02965223, 0.03862465,
0.02757333, 0.02760122, 0.02326489, 0.03060641, 0.04889489,
0.04548082, 0.03419604, 0.04679499, 0.03232552, 0.02373119,
0.02264998, 0.02891037, 0.02546251, 0.03122795, 0.04522164,
0.03640214, 0.03431643, 0.02788697, 0.02881985, 0.05731613,
0.02620803, 0.04404795, 0.03134402, 0.01949279, 0.04392969,
0.0528932 , 0.03530596, 0.02445041, 0.02427927, 0.04018844]), 'rank_test_score': array([28, 37, 46, 30, 36, 44, 45, 42, 33, 29, 6, 25, 8, 21, 10, 27, 41,
19, 15, 13, 31, 17, 39, 9, 32, 48, 16, 23, 35, 22, 24, 50, 3, 14,
38, 43, 26, 18, 20, 47, 40, 49, 4, 1, 7, 12, 2, 11, 34, 5],
dtype=int32)}
print(bayes_search.best_score_)
0.7676473181792952
print(bayes_search.best_params_)
OrderedDict([('model__colsample_bytree', 0.22941721714014698), ('model__learning_rate', 0.01), ('model__max_depth', 7), ('model__n_estimators', 1750), ('model__subsample', 0.2218705814625066), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())])
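Since refit=True by default, BayesSearchCV has already refit the best pipeline (preprocessing plus model) on all of X_train, so it can be used directly; a minimal sketch:

# the tuned pipeline, refit on the full training set
best_pipeline = bayes_search.best_estimator_
# predicted probability of the positive class for the first few rows
best_pipeline.predict_proba(X_train[0:5])[:, 1]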
new_param_column_names = {
'model__max_depth': 'max_depth',
'model__n_estimators': 'n_estimators',
'model__learning_rate': 'learning_rate',
'model__colsample_bytree': 'colsample_bytree',
'model__subsample': 'subsample',
# 'pca__transformer': 'pca',
# 'pca__n_components': 'pca: n_comps',
# 'pca__gamma': 'pca: gamma',
# 'pca__kernel': 'pca: kernel',
'prep__non_numeric__encoder__transformer': 'encoder',
'prep__numeric__imputer__transformer': 'imputer',
'prep__numeric__scaler__transformer': 'scaler'
}
parser = hlp.sklearn_eval.MLExperimentResults.from_sklearn_search_cv(
searcher=bayes_search,
higher_score_is_better = True,
parameter_name_mappings = new_param_column_names
)
parser.to_yaml_file(yaml_file_name = 'Run 1 - XGBoost - BayesSearchCV.yaml')
parser = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name = 'Run 1 - XGBoost - BayesSearchCV.yaml')
parser.fit_time_averages
array([0.89816132, 3.0764225 , 0.82913582, 4.30177827, 1.92396953,
1.45830369, 1.4233129 , 1.88282964, 1.68454292, 0.18679664,
1.01264901, 0.14172795, 0.45804882, 2.67004008, 0.13521459,
1.36825943, 0.46995008, 0.65330856, 0.19113827, 0.47605207,
2.28726478, 0.19134943, 0.0461545 , 0.1843302 , 0.88133376,
0.03310969, 0.43091331, 0.23171203, 0.43778236, 0.34669089,
0.08219891, 0.48769341, 0.05991201, 0.16163681, 2.77276096,
1.46445477, 0.29412501, 1.91430132, 1.87365606, 0.02733583,
0.38780365, 0.02459159, 0.02633662, 1.27306516, 0.40463541,
2.03928359, 0.56104724, 0.35019045, 2.38269603, 0.28807607])
parser.best_primary_score
0.7676473181792952
parser.best_primary_score_params
{'colsample_bytree': 0.22941721714014698,
'learning_rate': 0.01,
'max_depth': 7,
'n_estimators': 1750,
'subsample': 0.2218705814625066,
'encoder': 'OneHotEncoder()',
'imputer': 'SimpleImputer()',
'scaler': 'StandardScaler()'}
parser.to_formatted_dataframe(num_rows=20)
| roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | colsample_bytree | learning_rate | max_depth | n_estimators | subsample | encoder | scaler |
|---|---|---|---|---|---|---|---|---|---|
| 0.768 | 0.754 | 0.782 | 0.229 | 0.010 | 7 | 1,750 | 0.222 | OneHotEncoder() | StandardScaler() |
| 0.765 | 0.740 | 0.790 | 0.010 | 0.010 | 3 | 1,925 | 0.304 | OneHotEncoder() | StandardScaler() |
| 0.765 | 0.747 | 0.783 | 0.010 | 0.141 | 3 | 135 | 1.000 | OneHotEncoder() | StandardScaler() |
| 0.762 | 0.740 | 0.785 | 0.010 | 0.165 | 3 | 50 | 1.000 | OneHotEncoder() | StandardScaler() |
| 0.762 | 0.733 | 0.791 | 0.010 | 0.010 | 5 | 912 | 0.335 | OneHotEncoder() | StandardScaler() |
| 0.759 | 0.730 | 0.788 | 0.084 | 0.010 | 10 | 1,823 | 0.224 | OneHotEncoder() | MinMaxScaler() |
| 0.758 | 0.727 | 0.790 | 0.010 | 0.064 | 4 | 1,485 | 0.262 | OneHotEncoder() | StandardScaler() |
| 0.758 | 0.724 | 0.791 | 0.010 | 0.010 | 10 | 2,000 | 0.100 | CustomOrdinalEncoder() | StandardScaler() |
| 0.758 | 0.736 | 0.780 | 0.010 | 0.020 | 9 | 701 | 0.100 | CustomOrdinalEncoder() | StandardScaler() |
| 0.757 | 0.735 | 0.779 | 0.048 | 0.011 | 9 | 285 | 0.117 | OneHotEncoder() | MinMaxScaler() |
| 0.757 | 0.739 | 0.774 | 0.010 | 0.010 | 5 | 1,114 | 0.207 | OneHotEncoder() | None |
| 0.755 | 0.717 | 0.793 | 0.896 | 0.010 | 8 | 1,075 | 0.261 | OneHotEncoder() | MinMaxScaler() |
| 0.755 | 0.727 | 0.782 | 0.010 | 0.017 | 10 | 1,764 | 0.100 | CustomOrdinalEncoder() | None |
| 0.753 | 0.730 | 0.775 | 0.010 | 0.134 | 3 | 619 | 0.328 | OneHotEncoder() | StandardScaler() |
| 0.751 | 0.730 | 0.772 | 0.721 | 0.024 | 9 | 95 | 0.657 | CustomOrdinalEncoder() | MinMaxScaler() |
| 0.751 | 0.726 | 0.775 | 0.330 | 0.018 | 9 | 712 | 0.100 | OneHotEncoder() | StandardScaler() |
| 0.749 | 0.729 | 0.769 | 0.010 | 0.010 | 10 | 727 | 0.190 | OneHotEncoder() | MinMaxScaler() |
| 0.748 | 0.728 | 0.768 | 1.000 | 0.010 | 10 | 2,000 | 0.100 | OneHotEncoder() | None |
| 0.745 | 0.720 | 0.770 | 0.010 | 0.010 | 9 | 1,922 | 0.380 | OneHotEncoder() | StandardScaler() |
| 0.743 | 0.722 | 0.764 | 1.000 | 0.010 | 5 | 1,628 | 0.100 | OneHotEncoder() | None |
# gives the score rank for each trial index
# e.g. array([4, 2, 1, 3]) means
# the 1st iteration (i.e. set of params) was the worst
# and the 3rd iteration was the best.
parser.primary_score_trial_ranking
array([28, 37, 46, 30, 36, 44, 45, 42, 33, 29, 6, 25, 8, 21, 10, 27, 41,
19, 15, 13, 31, 17, 39, 9, 32, 48, 16, 23, 35, 22, 24, 50, 3, 14,
38, 43, 26, 18, 20, 47, 40, 49, 4, 1, 7, 12, 2, 11, 34, 5])
# gives the trial indexes ordered from best score to worst
# e.g. a parser.primary_score_trial_ranking of array([4, 2, 1, 3])
# would return [2, 1, 3, 0] because index 2 (i.e. 3rd iteration) was the best, so it is the first index;
# and index 0 (i.e. first iteration) was the worst
parser.primary_score_best_indexes
array([43, 46, 32, 42, 49, 10, 44, 12, 23, 14, 47, 45, 19, 33, 18, 26, 21,
37, 17, 38, 13, 29, 27, 30, 11, 36, 15, 0, 9, 3, 20, 24, 8, 48,
28, 4, 1, 34, 22, 40, 16, 7, 35, 5, 6, 2, 39, 25, 41, 31])
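The relationship between the two attributes can be illustrated with a small numpy sketch (not helpsk's internals):

ranking = np.array([4, 2, 1, 3])  # rank of each trial (1 = best)
np.argsort(ranking)               # trial indexes ordered best to worst -> array([2, 1, 3, 0])

For example, np.argsort(parser.primary_score_trial_ranking) reproduces the primary_score_best_indexes array above.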
parser.plot_performance_across_trials().show()
parser.plot_performance_across_trials(size='learning_rate', color='max_depth').show()
parser.plot_performance_across_trials(size='learning_rate', color='encoder').show()
parser.plot_parameter_values_across_trials().show()
parser.plot_scatter(height=1000, width=1000 * hlp.plot.GOLDEN_RATIO).show()
parser.plot_performance_numeric_params(height=800)
parser.plot_parallel_coordinates().show()
parser.plot_performance_non_numeric_params()
parser.plot_score_vs_parameter(
parameter='learning_rate',
size='colsample_bytree',
color='scaler'
)
parser.plot_parameter_vs_parameter(parameter_x='colsample_bytree',
parameter_y='learning_rate',
size='max_depth'
)
parser.plot_parameter_vs_parameter(parameter_x='colsample_bytree',
parameter_y='learning_rate',
size='imputer')
score_variable = parser.primary_score_name + ' Mean'
score_dataframe = parser.to_dataframe()
score_dataframe = score_dataframe.drop(columns=[x for x in score_dataframe.columns
if x not in [score_variable] + parser.parameter_names])
score_dataframe.head()
| | roc_auc Mean | colsample_bytree | learning_rate | max_depth | n_estimators | subsample | encoder | scaler |
|---|---|---|---|---|---|---|---|---|
| 43 | 0.767647 | 0.229417 | 0.010000 | 7 | 1750 | 0.221871 | OneHotEncoder() | StandardScaler() |
| 46 | 0.765176 | 0.010000 | 0.010000 | 3 | 1925 | 0.304311 | OneHotEncoder() | StandardScaler() |
| 32 | 0.764767 | 0.010000 | 0.141191 | 3 | 135 | 1.000000 | OneHotEncoder() | StandardScaler() |
| 42 | 0.762142 | 0.010000 | 0.164514 | 3 | 50 | 1.000000 | OneHotEncoder() | StandardScaler() |
| 49 | 0.762120 | 0.010000 | 0.010000 | 5 | 912 | 0.334520 | OneHotEncoder() | StandardScaler() |
cleaned_column_names = [''.join(e for e in x.replace(' ', '_') if e == '_' or e.isalnum()) for x in score_dataframe.columns.tolist()]
cleaned_column_names = dict(zip(score_dataframe.columns.tolist(), cleaned_column_names))
cleaned_column_names
{'roc_auc Mean': 'roc_auc_Mean',
'colsample_bytree': 'colsample_bytree',
'learning_rate': 'learning_rate',
'max_depth': 'max_depth',
'n_estimators': 'n_estimators',
'subsample': 'subsample',
'encoder': 'encoder',
'scaler': 'scaler'}
score_dataframe = score_dataframe.rename(columns=cleaned_column_names)
import statsmodels.formula.api as smf
y_column = 'roc_auc_Mean'
X_columns = score_dataframe.columns.tolist()
X_columns.remove(y_column)
X_columns = hlp.string.collapse(X_columns, separate=" + ", surround="")
formula = f"{y_column} ~ {X_columns}"
print(formula)
model = smf.ols(formula=formula, data=score_dataframe)
results = model.fit()
print(results.summary())
roc_auc_Mean ~ colsample_bytree + learning_rate + max_depth + n_estimators + subsample + encoder + scaler
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.447
Model: OLS Adj. R-squared: 0.339
Method: Least Squares F-statistic: 4.137
Date: Sun, 30 Jan 2022 Prob (F-statistic): 0.00109
Time: 19:16:27 Log-Likelihood: 125.56
No. Observations: 50 AIC: -233.1
Df Residuals: 41 BIC: -215.9
Df Model: 8
Covariance Type: nonrobust
==============================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------
Intercept 0.7531 0.018 41.643 0.000 0.717 0.790
encoder[T.OneHotEncoder()] 0.0120 0.007 1.712 0.095 -0.002 0.026
scaler[T.None] -0.0170 0.009 -1.986 0.054 -0.034 0.000
scaler[T.StandardScaler()] -0.0067 0.009 -0.789 0.435 -0.024 0.011
colsample_bytree -0.0167 0.009 -1.919 0.062 -0.034 0.001
learning_rate -0.1490 0.042 -3.560 0.001 -0.234 -0.064
max_depth -0.0009 0.001 -0.628 0.533 -0.004 0.002
n_estimators 5.539e-06 4.92e-06 1.125 0.267 -4.4e-06 1.55e-05
subsample -0.0068 0.010 -0.649 0.520 -0.028 0.014
==============================================================================
Omnibus: 13.176 Durbin-Watson: 0.738
Prob(Omnibus): 0.001 Jarque-Bera (JB): 14.950
Skew: -1.005 Prob(JB): 0.000567
Kurtosis: 4.771 Cond. No. 1.79e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.79e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
# The large condition number above is mostly a scaling artifact (n_estimators is in the thousands
# while learning_rate is near zero), so standardize the numeric parameters and refit; this puts the
# coefficients on a comparable scale without changing the t-statistics or p-values.
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import pandas as pd
numeric_columns = hlp.pandas.get_numeric_columns(score_dataframe)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(score_dataframe)
print(numeric_columns)
print(non_numeric_columns)
numeric_pipeline = Pipeline([
('scaling', StandardScaler()),
])
transformations_pipeline = ColumnTransformer([
('numeric_pipeline', numeric_pipeline, numeric_columns),
('non_numeric_pipeline', 'passthrough', non_numeric_columns)
])
score_dataframe_transformed = transformations_pipeline.fit_transform(score_dataframe)
score_dataframe_transformed = pd.DataFrame(score_dataframe_transformed,
columns= numeric_columns + non_numeric_columns)
score_dataframe_transformed.head()
['roc_auc_Mean', 'colsample_bytree', 'learning_rate', 'max_depth', 'n_estimators', 'subsample'] ['encoder', 'scaler']
| roc_auc_Mean | colsample_bytree | learning_rate | max_depth | n_estimators | subsample | encoder | scaler | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1.406237 | -0.396553 | -0.919358 | 0.138245 | 0.953792 | -0.775493 | OneHotEncoder() | StandardScaler() |
| 1 | 1.312621 | -0.924408 | -0.919358 | -1.316965 | 1.209097 | -0.531473 | OneHotEncoder() | StandardScaler() |
| 2 | 1.297145 | -0.924408 | 0.531053 | -1.316965 | -1.402312 | 1.527734 | OneHotEncoder() | StandardScaler() |
| 3 | 1.197734 | -0.924408 | 0.7889 | -1.316965 | -1.526317 | 1.527734 | OneHotEncoder() | StandardScaler() |
| 4 | 1.196914 | -0.924408 | -0.919358 | -0.58936 | -0.268756 | -0.442055 | OneHotEncoder() | StandardScaler() |
# ColumnTransformer returns an object-dtype numpy array (because of the passthrough string columns),
# so convert the numeric columns back to float before refitting the regression
score_dataframe_transformed['roc_auc_Mean'] = score_dataframe_transformed['roc_auc_Mean'].astype('float')
score_dataframe_transformed['colsample_bytree'] = score_dataframe_transformed['colsample_bytree'].astype('float')
score_dataframe_transformed['learning_rate'] = score_dataframe_transformed['learning_rate'].astype('float')
score_dataframe_transformed['max_depth'] = score_dataframe_transformed['max_depth'].astype('float')
score_dataframe_transformed['n_estimators'] = score_dataframe_transformed['n_estimators'].astype('float')
score_dataframe_transformed['subsample'] = score_dataframe_transformed['subsample'].astype('float')
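As an aside, and assuming a scikit-learn version that supports set_output (>= 1.2, possibly newer than the one used here), the dtype round-trip above can be avoided by asking the ColumnTransformer for pandas output; the _alt names below are mine, used only for this sketch.
transformations_pipeline_alt = ColumnTransformer(
    [
        ('numeric_pipeline', numeric_pipeline, numeric_columns),
        ('non_numeric_pipeline', 'passthrough', non_numeric_columns),
    ],
    verbose_feature_names_out=False,   # keep the original (unprefixed) column names
).set_output(transform='pandas')       # return a DataFrame; passthrough columns keep their dtypes
score_dataframe_transformed_alt = transformations_pipeline_alt.fit_transform(score_dataframe)
score_dataframe_transformed_alt.dtypes  # numeric columns stay float, so no manual astype is needed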
print(formula)
model = smf.ols(formula=formula, data=score_dataframe_transformed)
results = model.fit()
print(results.summary())
roc_auc_Mean ~ colsample_bytree + learning_rate + max_depth + n_estimators + subsample + encoder + scaler
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.447
Model: OLS Adj. R-squared: 0.339
Method: Least Squares F-statistic: 4.137
Date: Sun, 30 Jan 2022 Prob (F-statistic): 0.00109
Time: 19:16:29 Log-Likelihood: -56.153
No. Observations: 50 AIC: 130.3
Df Residuals: 41 BIC: 147.5
Df Model: 8
Covariance Type: nonrobust
==============================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------
Intercept -0.0315 0.280 -0.113 0.911 -0.596 0.533
encoder[T.OneHotEncoder()] 0.4558 0.266 1.712 0.095 -0.082 0.994
scaler[T.None] -0.6433 0.324 -1.986 0.054 -1.297 0.011
scaler[T.StandardScaler()] -0.2552 0.323 -0.789 0.435 -0.908 0.398
colsample_bytree -0.2626 0.137 -1.919 0.062 -0.539 0.014
learning_rate -0.5105 0.143 -3.560 0.001 -0.800 -0.221
max_depth -0.0906 0.144 -0.628 0.533 -0.382 0.201
n_estimators 0.1438 0.128 1.125 0.267 -0.114 0.402
subsample -0.0867 0.134 -0.649 0.520 -0.357 0.183
==============================================================================
Omnibus: 13.176 Durbin-Watson: 0.738
Prob(Omnibus): 0.001 Jarque-Bera (JB): 14.950
Skew: -1.005 Prob(JB): 0.000567
Kurtosis: 4.771 Cond. No. 5.47
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
coefficients = pd.DataFrame({
'feature': results.params.index,
'coefficient': results.params,
'p_value': results.pvalues,
})
coefficients = coefficients.query("feature != 'Intercept'")
coefficients['Stat Sig'] = coefficients['p_value'] <= 0.05
coefficients
| feature | coefficient | p_value | Stat Sig | |
|---|---|---|---|---|
| encoder[T.OneHotEncoder()] | encoder[T.OneHotEncoder()] | 0.455777 | 0.094516 | False |
| scaler[T.None] | scaler[T.None] | -0.643305 | 0.053695 | False |
| scaler[T.StandardScaler()] | scaler[T.StandardScaler()] | -0.255160 | 0.434678 | False |
| colsample_bytree | colsample_bytree | -0.262561 | 0.061963 | False |
| learning_rate | learning_rate | -0.510478 | 0.000954 | True |
| max_depth | max_depth | -0.090611 | 0.533413 | False |
| n_estimators | n_estimators | 0.143790 | 0.267172 | False |
| subsample | subsample | -0.086664 | 0.520216 | False |
score_variable
'roc_auc Mean'
px.bar(
data_frame=coefficients.reindex(coefficients['coefficient'].abs().sort_values(ascending=True).index),
y='feature',
x='coefficient',
color='Stat Sig',
title=f"Regression Coefficients of Hyper-parameters against '{score_variable}'",
height=600,
width=600*hlp.plot.GOLDEN_RATIO
)
from sklearn.inspection import permutation_importance
estimator = bayes_search.best_estimator_
start_time = time.time()
result = permutation_importance(
estimator, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2
)
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")
feature_names = X_train.columns.to_list()
forest_importances = pd.Series(result.importances_mean, index=feature_names)
forest_importances = forest_importances.sort_values(ascending=False)
Elapsed time to compute the importances: 9.365 seconds
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.set_size_inches(9, 6)
fig.tight_layout()
plt.show()
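To show the spread across the 10 permutation repeats rather than only the mean ± standard deviation, a small follow-up sketch (not in the original notebook):
# one box per feature, summarizing the 10 permutation repeats
sorted_idx = result.importances_mean.argsort()
fig, ax = plt.subplots(figsize=(9, 6))
ax.boxplot(result.importances[sorted_idx].T, vert=False,
           labels=np.array(feature_names)[sorted_idx])
ax.set_title("Permutation importance distributions (10 repeats)")
ax.set_xlabel("Decrease in accuracy")
fig.tight_layout()
plt.show()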
temp = X_train.copy()
temp['default'] = y_train
temp.groupby('foreign_worker').agg({'default': np.mean})
| default | |
|---|---|
| foreign_worker | |
| yes | 0.308290 |
| no | 0.107143 |
fig = px.box(
    data_frame=temp,
    y='age',
    x='default',
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)
fig.show()
NOTE: foreign_worker seems like it should be important (the default rate is roughly 31% for foreign workers vs. 11% for non-foreign workers), yet it ranks last in permutation importance. One possible explanation is that very few applicants fall in the 'no' group, so shuffling the column changes almost no predictions and the score barely drops.
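A quick way to check that explanation (a sketch, not in the original notebook) is to look at how unbalanced the column is:
# proportion of each foreign_worker value in the training data
X_train['foreign_worker'].value_counts(normalize=True)
# if the 'no' group is only a few percent of the rows, permuting the column
# leaves most predictions untouched, which would explain the low importance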